In [1]:
from pymongo import MongoClient
from datetime import datetime, date, timedelta, time
from dateutil import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.basemap import Basemap
import pymysql
matplotlib.style.use('ggplot')
In [2]:
def toUnix(datetime):
unix = datetime.strftime('%s')
return unix
def convertDatetime(unix):
dt = datetime.fromtimestamp(unix)
return dt.strftime('%Y-%m-%d %H:%M:%S')
def convertDate(unix):
d = date.fromtimestamp(unix)
return d.strftime('%Y-%m-%d')
In [3]:
class RailDatabase():
def __init__(self, isNew):
if isNew:
self.conn = pymysql.connect(host='localhost', port=3306, user='jnevens', passwd='Panda85?', db='newrailDB')
else:
self.conn = pymysql.connect(host='localhost', port=3306, user='jnevens', passwd='Panda85?', db='oldrailDB')
def getAllRoutes(self, date):
C = self.conn.cursor()
C.execute('SELECT route_id FROM route WHERE date = %s', (date,))
rows = C.fetchall()
C.close()
return rows
def getStops(self, routeID):
C = self.conn.cursor()
C.execute('SELECT * FROM stop WHERE route_id = %s', (routeID,))
rows = C.fetchall()
C.close()
return rows
def getLastStop(self, routeID):
C = self.conn.cursor()
C.execute('SELECT * FROM stop WHERE route_id = %s ORDER BY arrival_datetime', (routeID,))
rows = C.fetchall()
last = rows[-1:]
C.close()
return last
def getAllStations(self):
C = self.conn.cursor()
C.execute('SELECT * FROM station')
rows = C.fetchall()
C.close()
return rows
class TweetDatabase():
def __init__(self, isNew):
self.conn = MongoClient().data_science
self.new = isNew
def connect(self):
if self.new:
return self.conn.new_tweets
else:
return self.conn.old_tweets
class Config():
def __init__(self, isNew):
self.new = isNew
##EQUAL PERIOD BEFORE AND AFTER NEW SCHEDULE##
def period(self):
if self.new:
return (date(2014, 10, 27), date(2015, 2, 1))
else:
return (date(2014, 10, 27), date(2014, 12, 14))
In [4]:
def tweetsPerDay(isNew):
conf = Config(isNew)
start, stop = conf.period()
period = pd.date_range(start, stop)
#Tweets Per Day
tpd = pd.Series(0, index = period)
db = TweetDatabase(False).connect()
oldTweets = db.find()
for tweet in oldTweets:
create_datetime = tweet['created_at']
create_datetime = parser.parse(create_datetime, ignoretz = True)
create_date = create_datetime.date()
d = create_date.isoformat()
tpd[d] += 1
db = TweetDatabase(True).connect()
newTweets = db.find()
for tweet in newTweets:
create_datetime = tweet['created_at']
create_datetime = parser.parse(create_datetime, ignoretz = True)
create_date = create_datetime.date()
if create_date <= stop:
d = create_date.isoformat()
tpd[d] += 1
return tpd
In [6]:
tpd = tweetsPerDay(True)
fig = plt.figure()
plt.bar(np.arange(tpd.size), tpd, color='b')
plt.axvline(49, color='r')
plt.title('Tweets per day')
plt.xlabel('Date')
plt.ylabel('Amount of Tweets')
plt.ylim(ymax=1800)
fig.autofmt_xdate()
plt.savefig('./plots/equal_period_tweets.pdf')
plt.show()
In [9]:
def delaysPerDay(isNew):
conf = Config(isNew)
start, stop = conf.period()
limit = date(2014, 12, 14)
diff = stop - start
period = pd.date_range(start, stop)
zeros = np.zeros((diff.days+1, 2))
#DataFrame with worst-case and avg delay on entire network, per day
#Delay is computed in a worst-case scenario, i.e. max delay of a train
#divided by the amount of trains on that day.
#In avg-case scenario, i.e. avg of avg of arrival and avg of departure delay
#divided by the amount of trains on that day.
delays = pd.DataFrame(zeros, index = period, columns = ['Worst case', 'Avg case'])
while(start <= stop):
n = start > limit
db = RailDatabase(n)
t = time(0, 0, 0)
dt = datetime.combine(start, t)
#Get all routes for a specified date.
#Every trainID rides only once each day, so there is no point in asking all the trainIDs first
routes = db.getAllRoutes(toUnix(dt))
#Instead of keeping a list of worst_delays {which is O(n) in memory},
#the worst_delay is accumulated and divided by the amount of data points {which is O(1) in memory}
total_worst_delay = 0
worst_delay_count = 0
total_avg_delay = 0
avg_delay_count = 0
for routeRow in routes:
routeID = routeRow[0]
stops = db.getStops(routeID)
max_delay = 0
total_arrival_delay = 0
arrival_delay_count = 0
total_departure_delay = 0
departure_delay_count = 0
for stopRow in stops:
arrival_detected = stopRow[3]
departure_detected = stopRow[6]
arrival_delay = stopRow[2]
departure_delay = stopRow[5]
if arrival_detected:
total_arrival_delay += arrival_delay
arrival_delay_count += 1
if departure_detected:
total_departure_delay += departure_delay
departure_delay_count += 1
if max(arrival_delay, departure_delay) > max_delay:
max_delay = max(arrival_delay, departure_delay)
total_worst_delay += max_delay
worst_delay_count += 1
if arrival_delay_count == 0:
arrival_delay_count = 1
if departure_delay_count == 0:
departure_delay_count = 1
avg_arrival = float(total_arrival_delay) / float(arrival_delay_count)
avg_departure = float(total_departure_delay) / float(departure_delay_count)
total_avg_delay += np.mean([avg_arrival, avg_departure])
avg_delay_count += 1
key = start.isoformat()
if worst_delay_count == 0:
worst_delay_count = 1
if avg_delay_count == 0:
avg_delay_count = 1
delays['Worst case'][key] = float(total_worst_delay) / float(worst_delay_count)
delays['Avg case'][key] = float(total_avg_delay) / float(avg_delay_count)
delta = timedelta(days=1)
start = start + delta
return delays
In [10]:
delays = delaysPerDay(True)
rolling = pd.rolling_mean(delays, 4, center=True)
ax_delays = delays.plot(x_compat=True, style='--', color=['r', 'b'])
rolling.plot(color=['r','b'], ax=ax_delays, legend=0)
plt.axvline('2014-12-14', color='g')
plt.title('Delays per day on entire network')
plt.xlabel('Date')
plt.ylabel('Minutes')
plt.savefig('./../../Paper/plots/equal_period_delays.png')
In [8]:
def cancelsPerDay(isNew):
conf = Config(isNew)
start, stop = conf.period()
limit = date(2014, 12, 14)
period = pd.date_range(start, stop)
#Series with the amount of cancelled trains, per day
#A train is considered cancelled when arrival is not detected at its final stop
cancels = pd.Series(0, index = period)
while(start <= stop):
n = start > limit
db = RailDatabase(n)
t = time(0, 0, 0)
dt = datetime.combine(start, t)
key = start.isoformat()
#Get all routes for a specified date.
#Every trainID rides only once each day, so there is no point in asking all the trainIDs first
routes = db.getAllRoutes(toUnix(dt))
for routeRow in routes:
routeID = routeRow[0]
lastStop = db.getLastStop(routeID)
for stopRow in lastStop:
arrival_detected = stopRow[3]
if not arrival_detected:
cancels[key] += 1
delta = timedelta(days=1)
start = start + delta
return cancels
In [9]:
cancels = cancelsPerDay(True)
fig = plt.figure()
plt.bar(np.arange(98), cancels, color='b')
plt.axvline(49, color='r')
plt.title('Cancelled trains per day')
plt.xlabel('Date')
plt.ylabel('Cancelled trains')
fig.autofmt_xdate()
plt.savefig('./plots/equal_period_cancels.pdf')
plt.show()
In [6]:
def percentageDelays(isNew):
conf = Config(isNew)
start, stop = conf.period()
diff = stop - start
period = pd.date_range(start, stop)
zeros = np.zeros((diff.days+1, 2))
limit = date(2014, 12, 14)
#DataFrame with the number of delayed trains per day
#A train is considered delayed when it suffers a delay of more than 5 minutes at any stop
#The 2nd column is the percentage of delayed trains on that day.
perctDelays = pd.DataFrame(zeros, index = period, columns = ['Delayed', 'Percentage'])
while(start <= stop):
n = start > limit
db = RailDatabase(n)
t = time(0, 0, 0)
dt = datetime.combine(start, t)
#Get all routes for a specified date.
#Every trainID rides only once each day, so there is no point in asking all the trainIDs first
routes = db.getAllRoutes(toUnix(dt))
key = start.isoformat()
count = len(routes)
#Get all stops for the route
for routeRow in routes:
routeID = routeRow[0]
stops = db.getStops(routeID)
#If a stop is found in the route, with > 5min delay
#then this train is considered delayed!
for stopRow in stops:
arrival_detected = stopRow[3]
departure_detected = stopRow[6]
arrival_delay = stopRow[2]
departure_delay = stopRow[5]
if arrival_detected:
if arrival_delay > 5:
perctDelays['Delayed'][key] += 1
break
elif departure_detected:
if departure_delay > 5:
perctDelays['Delayed'][key] += 1
break
perctDelays['Percentage'][key] = (float(perctDelays['Delayed'][key]) / float(count)) * float(100)
delta = timedelta(days=1)
start = start + delta
return perctDelays
In [7]:
perctDelays = percentageDelays(True)
fig = plt.figure()
plt.bar(perctDelays.index, perctDelays['Percentage'], color='b')
plt.axvline('2014-12-15', color='r')
plt.title('Percentage of delayed trains per day')
plt.xlabel('Date')
plt.ylabel('Percentage')
plt.ylim(ymax=60)
fig.autofmt_xdate()
plt.savefig('./plots/equal_period_percentage_delays.pdf')
plt.show()
In [ ]: